{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ImageBind\n",
    "\n",
    "This demo shows how to use the ImageBind to compute alignment between modalities.\n",
    "\n",
    "**Install**\n",
    "\n",
    "Please see [ImageBind Github](https://github.com/facebookresearch/ImageBind) repository for installation instructions. Basically, \n",
    "\n",
    "\n",
    "```\n",
    "git clone https://github.com/facebookresearch/ImageBind.git\n",
    "\n",
    "pip install -r requirements.txt\n",
    "```\n",
    "\n",
    "However, if an error is encountered, install the package one by one. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import data\n",
    "import torch\n",
    "from models import imagebind_model\n",
    "from models.imagebind_model import ModalityType"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Compute Alignment\n",
    "\n",
    "In this example, 3 modalities are shown. The alignment is computed and printed out. Note two modalities are aligned when their dot product is almost 1. \n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vision x Text:  tensor([[9.9763e-01, 2.3512e-03, 1.8975e-05],\n",
      "        [3.3908e-05, 9.9994e-01, 2.3960e-05],\n",
      "        [4.7818e-05, 1.3439e-02, 9.8651e-01]], device='cuda:7')\n",
      "Audio x Text:  tensor([[1., 0., 0.],\n",
      "        [0., 1., 0.],\n",
      "        [0., 0., 1.]], device='cuda:7')\n",
      "Vision x Audio:  tensor([[0.8067, 0.1091, 0.0842],\n",
      "        [0.1033, 0.7895, 0.1072],\n",
      "        [0.0018, 0.0022, 0.9960]], device='cuda:7')\n"
     ]
    }
   ],
   "source": [
    "text_list=[\"A dog.\", \"A car\", \"A bird\"]\n",
    "image_paths=[\".assets/dog_image.jpg\", \".assets/car_image.jpg\", \".assets/bird_image.jpg\"]\n",
    "audio_paths=[\".assets/dog_audio.wav\", \".assets/car_audio.wav\", \".assets/bird_audio.wav\"]\n",
    "\n",
    "device = \"cuda:7\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# Instantiate model\n",
    "model = imagebind_model.imagebind_huge(pretrained=True)\n",
    "model.eval()\n",
    "model.to(device)\n",
    "\n",
    "# Load data\n",
    "inputs = {\n",
    "    ModalityType.TEXT: data.load_and_transform_text(text_list, device),\n",
    "    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),\n",
    "    ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),\n",
    "}\n",
    "\n",
    "with torch.no_grad():\n",
    "    embeddings = model(inputs)\n",
    "\n",
    "print(\n",
    "    \"Vision x Text: \",\n",
    "    torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1),\n",
    ")\n",
    "print(\n",
    "    \"Audio x Text: \",\n",
    "    torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1),\n",
    ")\n",
    "print(\n",
    "    \"Vision x Audio: \",\n",
    "    torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=-1),\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
